R Markdown

# Install and load the required package
library(gginference)
## Warning: package 'gginference' was built under R version 4.3.2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.2     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.2
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(BSDA)
## Warning: package 'BSDA' was built under R version 4.3.2
## Loading required package: lattice
## 
## Attaching package: 'BSDA'
## 
## The following object is masked from 'package:datasets':
## 
##     Orange
data <- read.csv("Obesity.csv")
data <- rename(data,Frequent_consumption_of_high_caloric_food = FAVC,Frequency_of_consumption_of_vegetables = FCVC, Number_of_main_meals = NCP, Consumption_of_food_between_meals = CAEC, Consumption_of_water_daily = CH2O, Consumption_of_alcohol = CALC,Calories_consumption_monitoring = SCC, Physical_activity_frequency = FAF, Time_using_technology_devices  = TUE, Transportation_used = MTRANS)
head(data)
##   Gender Age Height Weight family_history_with_overweight
## 1 Female  21   1.62   64.0                            yes
## 2 Female  21   1.52   56.0                            yes
## 3   Male  23   1.80   77.0                            yes
## 4   Male  27   1.80   87.0                             no
## 5   Male  22   1.78   89.8                             no
## 6   Male  29   1.62   53.0                             no
##   Frequent_consumption_of_high_caloric_food
## 1                                        no
## 2                                        no
## 3                                        no
## 4                                        no
## 5                                        no
## 6                                       yes
##   Frequency_of_consumption_of_vegetables Number_of_main_meals
## 1                                      2                    3
## 2                                      3                    3
## 3                                      2                    3
## 4                                      3                    3
## 5                                      2                    1
## 6                                      2                    3
##   Consumption_of_food_between_meals SMOKE Consumption_of_water_daily
## 1                         Sometimes    no                          2
## 2                         Sometimes   yes                          3
## 3                         Sometimes    no                          2
## 4                         Sometimes    no                          2
## 5                         Sometimes    no                          2
## 6                         Sometimes    no                          2
##   Calories_consumption_monitoring Physical_activity_frequency
## 1                              no                           0
## 2                             yes                           3
## 3                              no                           2
## 4                              no                           2
## 5                              no                           0
## 6                              no                           0
##   Time_using_technology_devices Consumption_of_alcohol   Transportation_used
## 1                             1                     no Public_Transportation
## 2                             0              Sometimes Public_Transportation
## 3                             1             Frequently Public_Transportation
## 4                             0             Frequently               Walking
## 5                             0              Sometimes Public_Transportation
## 6                             0              Sometimes            Automobile
##            NObeyesdad
## 1       Normal_Weight
## 2       Normal_Weight
## 3       Normal_Weight
## 4  Overweight_Level_I
## 5 Overweight_Level_II
## 6       Normal_Weight
data$obesity_categories <- ifelse(data$NObeyesdad %in% c("Insufficient_Weight", "Normal_Weight"), "Normal", (ifelse(data$NObeyesdad %in% c("Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III"), "Obese", "Overweight")))

head(data)
##   Gender Age Height Weight family_history_with_overweight
## 1 Female  21   1.62   64.0                            yes
## 2 Female  21   1.52   56.0                            yes
## 3   Male  23   1.80   77.0                            yes
## 4   Male  27   1.80   87.0                             no
## 5   Male  22   1.78   89.8                             no
## 6   Male  29   1.62   53.0                             no
##   Frequent_consumption_of_high_caloric_food
## 1                                        no
## 2                                        no
## 3                                        no
## 4                                        no
## 5                                        no
## 6                                       yes
##   Frequency_of_consumption_of_vegetables Number_of_main_meals
## 1                                      2                    3
## 2                                      3                    3
## 3                                      2                    3
## 4                                      3                    3
## 5                                      2                    1
## 6                                      2                    3
##   Consumption_of_food_between_meals SMOKE Consumption_of_water_daily
## 1                         Sometimes    no                          2
## 2                         Sometimes   yes                          3
## 3                         Sometimes    no                          2
## 4                         Sometimes    no                          2
## 5                         Sometimes    no                          2
## 6                         Sometimes    no                          2
##   Calories_consumption_monitoring Physical_activity_frequency
## 1                              no                           0
## 2                             yes                           3
## 3                              no                           2
## 4                              no                           2
## 5                              no                           0
## 6                              no                           0
##   Time_using_technology_devices Consumption_of_alcohol   Transportation_used
## 1                             1                     no Public_Transportation
## 2                             0              Sometimes Public_Transportation
## 3                             1             Frequently Public_Transportation
## 4                             0             Frequently               Walking
## 5                             0              Sometimes Public_Transportation
## 6                             0              Sometimes            Automobile
##            NObeyesdad obesity_categories
## 1       Normal_Weight             Normal
## 2       Normal_Weight             Normal
## 3       Normal_Weight             Normal
## 4  Overweight_Level_I         Overweight
## 5 Overweight_Level_II         Overweight
## 6       Normal_Weight             Normal
ggplot(data, aes(x = Weight, fill = NObeyesdad)) +
  geom_density(alpha = 0.5) +
  labs(title = "Density Plot of Age with Obesity Type",
       x = "Age",
       y = "Density") +
  theme_minimal() +
  facet_wrap(~NObeyesdad, scales = "free")

histogram <- plot_ly(data, x = ~obesity_categories, type = "histogram",color = ~family_history_with_overweight, facet_col = ~family_history_with_overweight,colors = c("steelblue","coral")) %>%
  layout(title = "Histogram of Obesity Type",
         xaxis = list(title = "Obesity Type"),
         yaxis = list(title = "Number of people"),barmode = "stack")


# Display the histogram
histogram
## Warning: 'histogram' objects don't have these attributes: 'facet_col'
## Valid attributes include:
## '_deprecated', 'alignmentgroup', 'autobinx', 'autobiny', 'bingroup', 'cliponaxis', 'constraintext', 'cumulative', 'customdata', 'customdatasrc', 'error_x', 'error_y', 'histfunc', 'histnorm', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'insidetextanchor', 'insidetextfont', 'legendgroup', 'legendgrouptitle', 'legendrank', 'marker', 'meta', 'metasrc', 'name', 'nbinsx', 'nbinsy', 'offsetgroup', 'opacity', 'orientation', 'outsidetextfont', 'selected', 'selectedpoints', 'showlegend', 'stream', 'text', 'textangle', 'textfont', 'textposition', 'textsrc', 'texttemplate', 'transforms', 'type', 'uid', 'uirevision', 'unselected', 'visible', 'x', 'xaxis', 'xbins', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'yaxis', 'ybins', 'ycalendar', 'yhoverformat', 'ysrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'

## Warning: 'histogram' objects don't have these attributes: 'facet_col'
## Valid attributes include:
## '_deprecated', 'alignmentgroup', 'autobinx', 'autobiny', 'bingroup', 'cliponaxis', 'constraintext', 'cumulative', 'customdata', 'customdatasrc', 'error_x', 'error_y', 'histfunc', 'histnorm', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'insidetextanchor', 'insidetextfont', 'legendgroup', 'legendgrouptitle', 'legendrank', 'marker', 'meta', 'metasrc', 'name', 'nbinsx', 'nbinsy', 'offsetgroup', 'opacity', 'orientation', 'outsidetextfont', 'selected', 'selectedpoints', 'showlegend', 'stream', 'text', 'textangle', 'textfont', 'textposition', 'textsrc', 'texttemplate', 'transforms', 'type', 'uid', 'uirevision', 'unselected', 'visible', 'x', 'xaxis', 'xbins', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'yaxis', 'ybins', 'ycalendar', 'yhoverformat', 'ysrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
box_plot <- plot_ly(data, y = ~Height, color = ~Gender, type = "violin", box = list(visible = TRUE),colors = c("darkviolet","darkgreen")) %>%
  layout(title = "Box Plot of Height Gender-wise",
         yaxis = list(title = "Height"),
         xaxis = list(title = "Gender"))

# Display the box plot
box_plot
age_breaks <- c(0,25, 34,61)
age_labels <- c("youth (<25)","Young Adults (25-34)","Adults (>=35)")
data$AgeGroup <- cut(data$Age, breaks = age_breaks, labels = age_labels)

# avg_data <- aggregate(Time_using_technology_devices ~ AgeGroup, data = data, FUN = mean)

line_chart <- plot_ly(
  type = "box",
  x = data$AgeGroup,
  y = data$Time_using_technology_devices,
)
line_chart <- line_chart %>% layout(title = "Boxplot of time using technology devices among various age groups",yaxis = list(title ="Age Groups"),yaxis = list(title ="Time_using_technology_devices"))

line_chart
heatmap_ggplot <- ggplot(data, aes(x = AgeGroup, y = Transportation_used, fill = Physical_activity_frequency)) +
  geom_tile(fun = "max") +

  labs(title = "Heatmap of Physical Activity frequency with respect to Age Group and Mode of transportation", x = "Age Groups", y = "Transportation Modes Used", fill = "Values")
## Warning in geom_tile(fun = "max"): Ignoring unknown parameters: `fun`
# Display the ggplot2 heatmap
print(heatmap_ggplot)

pie_chart <- plot_ly(
  data,
  labels = ~Calories_consumption_monitoring,
  type = "pie"
)


pie_chart <- pie_chart %>%
  layout(title = "Pie Chart of Percentage of people monitoring calorie consumtion")

# Display the pie chart
pie_chart
data$BMI = data$Weight/(data$Height**2)

ggplot()+geom_point(aes(x=data$Age,y=data$BMI,color = data$NObeyesdad))

# 1. Null hypothesis (H0): Mean height of Males and Females is Equal
# Alternative hypothesis (Ha): There is a difference between mean heights of Male and Female

# Conducting a two-tailed t-test
Male <- data %>% filter(Gender == "Male") %>% pull(Height)
summary(Male)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.560   1.705   1.760   1.759   1.807   1.980
Female <- data %>% filter(Gender == "Female") %>% pull(Height)
summary(Female)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.450   1.597   1.640   1.643   1.700   1.843
result <- t.test(x=Male, y=Female, alternative = "two.sided")
result
## 
##  Welch Two Sample t-test
## 
## data:  Male and Female
## t = 36.13, df = 2102.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1091292 0.1216558
## sample estimates:
## mean of x mean of y 
##  1.758690  1.643298
ggttest(result)
## Warning: `geom_vline()`: Ignoring `data` because `xintercept` was provided.

# 2. Null hypothesis (H0): Proportion of smokers is more than 0.5
# Alternative hypothesis (Ha): Proportion of smokers is less than 0.5

data$SMOKE_Encoded <- ifelse(data$SMOKE == "yes", 1, 0)

# Conducting a one-tailed proportion test
prop_test_smokers <- prop.test(sum(data$SMOKE_Encoded), length(data$SMOKE_Encoded), p = 0.5, alternative = "less")

print(prop_test_smokers)
## 
##  1-sample proportions test with continuity correction
## 
## data:  sum(data$SMOKE_Encoded) out of length(data$SMOKE_Encoded), null probability 0.5
## X-squared = 1936.8, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is less than 0.5
## 95 percent confidence interval:
##  0.00000000 0.02686893
## sample estimates:
##         p 
## 0.0208432
# 3. Null hypothesis (H0): Family history of obesity has almost no influence on Obesity level
# Alternative hypothesis (Ha): Family history of obesity influences Obesity level

# A bit about the chi squared test - To determine whether there is a significant correlation between two categorical variables, this test for independence is used. It compares the observed frequencies to those that would be expected under the assumption of independence using a contingency table. After normalizing the squared differences between the observed and expected frequencies, the test statistic Chi-squared (X²) is computed.

data$obesity_categories <- ifelse(data$NObeyesdad %in% c("Insufficient_Weight", "Normal_Weight"), "Normal", (ifelse(data$NObeyesdad %in% c("Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III"), "Obese", "Overweight")))

# Conducting a chi-squared test
contingency_table <- table(data$obesity_categories, data$family_history_with_overweight)
chi_squared_result <- chisq.test(contingency_table)

print("Chi-squared Test for Independence for Obesity category and Family history of Obesity:")
## [1] "Chi-squared Test for Independence for Obesity category and Family history of Obesity:"
print(chi_squared_result)
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 570.04, df = 2, p-value < 2.2e-16
ggchisqtest(chi_squared_result)

#Statistic calculation of weight and height
correlation <- cor(data$Weight, data$Height)
print(paste("Correlation between weight and height:", correlation))
## [1] "Correlation between weight and height: 0.463136116615627"
print(summary(data$BMI))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   24.33   28.72   29.70   36.02   50.81
#4. Null hypothesis (H0): Mean of people not monitoring calorie intake have higher BMI than the  population mean

# Alternative hypothesis (Ha): Mean of people monitoring calorie intake have lower BMI than the  population mean

# Conducting a one-tailed two sample t-test

Monitoring_no <- data %>% filter(Calories_consumption_monitoring == "no") %>% pull(BMI)
summary(Monitoring_no)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   24.77   29.30   30.02   36.21   50.81
Monitoring_yes <- data %>% filter(Calories_consumption_monitoring == "yes") %>% pull(BMI)
summary(Monitoring_yes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.79   19.32   24.19   22.94   25.39   36.16
result_monitoring <- t.test(x=Monitoring_no,y=Monitoring_yes, alternative = "less",mu=mean(data$BMI))
result_monitoring
## 
##  Welch Two Sample t-test
## 
## data:  Monitoring_no and Monitoring_yes
## t = -50.327, df = 133.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 29.70016
## 95 percent confidence interval:
##      -Inf 7.828825
## sample estimates:
## mean of x mean of y 
##  30.02233  22.93782
ggttest(result_monitoring) 
## Warning: `geom_vline()`: Ignoring `data` because `xintercept` was provided.

#5. Null hypothesis (H0): Frequency of alcohol consumption has almost no influence on Obesity level
# Alternative hypothesis (Ha): Frequency of alcohol consumption influences Obesity level

# Conducting a chi-squared test
contingency_table <- table(data$obesity_categories, data$Consumption_of_alcohol)
chi_squared_result <- chisq.test(contingency_table)
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
print("Chi-squared Test for Independence for Obesity category and Family history of Obesity:")
## [1] "Chi-squared Test for Independence for Obesity category and Family history of Obesity:"
print(chi_squared_result)
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 70.548, df = 6, p-value = 3.157e-13
ggchisqtest(chi_squared_result)

# Classifying task

# Split the data set into training and testing sets
set.seed(456)
train_indices <- sample(1:nrow(data), 0.7 * nrow(data))
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]
# Train the Decision tree classifier
library(rpart)

print(names(data))
##  [1] "Gender"                                   
##  [2] "Age"                                      
##  [3] "Height"                                   
##  [4] "Weight"                                   
##  [5] "family_history_with_overweight"           
##  [6] "Frequent_consumption_of_high_caloric_food"
##  [7] "Frequency_of_consumption_of_vegetables"   
##  [8] "Number_of_main_meals"                     
##  [9] "Consumption_of_food_between_meals"        
## [10] "SMOKE"                                    
## [11] "Consumption_of_water_daily"               
## [12] "Calories_consumption_monitoring"          
## [13] "Physical_activity_frequency"              
## [14] "Time_using_technology_devices"            
## [15] "Consumption_of_alcohol"                   
## [16] "Transportation_used"                      
## [17] "NObeyesdad"                               
## [18] "obesity_categories"                       
## [19] "AgeGroup"                                 
## [20] "BMI"                                      
## [21] "SMOKE_Encoded"
tree_model <- rpart(NObeyesdad ~ Gender +  Age + Height + Weight+ family_history_with_overweight +           Frequent_consumption_of_high_caloric_food + Frequency_of_consumption_of_vegetables + Consumption_of_water_daily+ Number_of_main_meals + Consumption_of_food_between_meals + SMOKE+ Calories_consumption_monitoring +Physical_activity_frequency+ Time_using_technology_devices + Consumption_of_alcohol  +Transportation_used, data = train_data)
predictions <- predict(tree_model, test_data, type = "class")

# Calculate accuracy
correct_predictions <- sum(predictions == test_data$NObeyesdad)
total_samples <- length(predictions)
accuracy <- correct_predictions / total_samples

# Print accuracy
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 86.44 %"